Data source

I used the weather data provided for German towns on the following website: https://klima.org/deutschland/alle-towns.html

The variables are the town name, the mean air temperature of the hottest and of the coldest month in degrees Celsius, the mean number of days per year with at least 1.0 mm of precipitation, the sum over all 12 months of each month's mean daily sunshine hours, and the mean water temperature in degrees Celsius.

# Load the weather table. sep = "" makes read.csv() split fields on any run of
# whitespace; the first line of the file holds the column names.
data <- read.csv(
  file = "./project_files/weather_data.csv",
  header = TRUE,
  sep = ""
)
# Show the first rows as a quick sanity check of the import.
head(data)

Data cleaning

I transform the variable $sun from the 12-month sum into the mean daily sunshine duration in hours (dividing by 12 and rounding to two decimals) and omit the variable $twater.

# Convert the 12-month sunshine sum into a mean daily duration in hours,
# rounded to two decimal places.
data$sun <- round(x = data$sun / 12, digits = 2)
# Keep only the first five columns of the table.
data <- data[seq_len(5)]

Data exploration

I create a histogram for each measurement variable and overlay the density function of a normal distribution with the same mean and standard deviation to get an overview of the data.

# Draw a density-scaled histogram of `x` and overlay the density of a normal
# distribution that has the same mean and standard deviation as `x`.
#
# x     numeric vector to plot
# grid  histogram break points; the normal density is evaluated (and the red
#       line drawn) at exactly these points. hist() stops with an error if
#       any value of `x` lies outside the range of `grid`.
# xlab  x-axis label
#
# Called for its plotting side effect; returns NULL invisibly.
plot_hist_with_normal <- function(x, grid, xlab) {
  hist(x,
       breaks = grid,
       xlab = xlab,
       main = NULL,
       freq = FALSE)
  lines(grid,
        dnorm(grid, mean = mean(x), sd = sd(x)),
        col = "red")
  invisible(NULL)
}

# Rainy days: one bin per day count.
plot_hist_with_normal(data$rain, 105:199, "Number of rainy days")

# Sunshine duration: bins of 0.1 h.
plot_hist_with_normal(data$sun, seq(3.1, 5.6, 0.1), "Sunshine duration [h]")

# Temperature of the coldest month: bins of 1 °C.
plot_hist_with_normal(data$tmin, -9:0, "Minimum temperature [°C]")

# Temperature of the hottest month: bins of 1 °C.
plot_hist_with_normal(data$tmax, 16:28, "Maximum temperature [°C]")

At this point, it is pretty clear that none of the variables visually match the density function of the normal distribution. In particular, the distribution of $rain doesn’t resemble any distribution I know. I decide to test all variables for normality with the Shapiro-Wilk test anyway.

# Run the Shapiro-Wilk test on `values` and return a one-line verdict.
#
# values  numeric vector to test
# label   text used to name the variable in the verdict
# alpha   significance level; the verdict is "normal" when p > alpha
#
# Note: p > alpha only means that normality is not rejected at that level.
# A plain if/else is used because the condition is a single value.
report_normality <- function(values, label, alpha = 0.05) {
  p_value <- shapiro.test(values)$p.value
  if (p_value > alpha) {
    paste(label, "is normal")
  } else {
    paste(label, "is non-normal")
  }
}

report_normality(data$rain, "data$rain")
## [1] "data$rain is non-normal"
report_normality(data$sun, "data$sun")
## [1] "data$sun is non-normal"
report_normality(data$tmin, "data$tmin")
## [1] "data$tmin is non-normal"
report_normality(data$tmax, "data$tmax")
## [1] "data$tmax is non-normal"

The Shapiro-Wilk test rejects normality at the 5% level for all four variables, so none of them can be treated as normally distributed.

# Correlation coefficient between rainy days and daily sunshine duration
# (cor() uses the Pearson method by default).
cor(x = data$rain, y = data$sun)
## [1] 0.0130903
# Scatter plot of sunshine against rain with filled points and no frame,
# plus the least-squares regression line of sun on rain in grey.
plot(data$rain, data$sun, frame = FALSE, pch = 19)
abline(reg = lm(sun ~ rain, data = data), col = "grey")

library("ggplot2")

# Co-occurrence frequency: for every town, the number of towns (itself
# included) that share exactly the same (rain, sun) combination.
#
# The count is taken per group of identical (rain, sun) pairs with ave().
# A hand-computed flat index into a counter vector is avoided on purpose:
# sun holds fractional hours, so such an index is not a whole number, and
# distinct pairs would be merged once R truncates it.
data$freq <- ave(rep(1L, nrow(data)), data$rain, data$sun, FUN = length)

# Bubble chart of sunshine duration against rainy days; the point size shows
# how many towns share the respective combination.
ggplot(data = data, aes(x = rain, y = sun, size = freq)) + geom_count()

library("plotly")
  
# For every town, build a comma-separated list of all towns (itself included)
# that have exactly the same (rain, sun) combination, in row order.
#
# The names are pasted once per group of identical (rain, sun) pairs instead
# of comparing every town with every other town, and the list is not cut off
# at a fixed number of characters.
data$twin_towns <- ave(
  as.character(data$towns), data$rain, data$sun,
  FUN = function(group_towns) paste(group_towns, collapse = ", ")
)

# Hover label for each town: the towns sharing its (rain, sun) combination,
# followed by the rain, sun and frequency values, one per line.
data$Text <- with(
  data,
  paste0("Towns: ", twin_towns, "\n",
         "Rain: ", rain, "\n",
         "Sun: ", sun, "\n",
         "Frequency: ", freq)
)

# Interactive heatmap of the frequency over rain (x) and sun (y) in grey
# shades; hovering over a cell shows only the label built above.
plot_ly(
  data,
  x = data$rain,
  y = data$sun,
  z = data$freq,
  type = "heatmap",
  colors = "Greys",
  text = data$Text,
  hoverinfo = "text"
)